# Data manipulation
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly.express as ptx
import seaborn as sns
# NLP and Text Processing
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk import FreqDist
# Machine Learning Models
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
# Model Evaluation Metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Tokenization and Sequence Padding (for neural networks)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Visualization and Plotting (Additional)
import plotly.subplots as sp
import pydot
import graphviz
# Location of the WELFake news CSV on disk
DATA_PATH = "./data/WELFake_Dataset.csv"
# Load the dataset into a DataFrame
dataset = pd.read_csv(DATA_PATH)
# Peek at the first five rows to understand the schema
dataset.head()
| Unnamed: 0 | title | text | label | |
|---|---|---|---|---|
| 0 | 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | No comment is expected from Barack Obama Membe... | 1 |
| 1 | 1 | NaN | Did they post their votes for Hillary already? | 1 |
| 2 | 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | Now, most of the demonstrators gathered last ... | 1 |
| 3 | 3 | Bobby Jindal, raised Hindu, uses story of Chri... | A dozen politically active pastors came here f... | 0 |
| 4 | 4 | SATAN 2: Russia unvelis an image of its terrif... | The RS-28 Sarmat missile, dubbed Satan 2, will... | 1 |
# List the column names; 'Unnamed: 0' is a leftover CSV row index
dataset.columns
Index(['Unnamed: 0', 'title', 'text', 'label'], dtype='object')
# Inspect the ratio of genuine to deceptive news in the dataset.
# NOTE(review): the sample rows above (sensational all-caps headlines carry
# label 1, mainstream reporting carries label 0) — as do the histogram
# labels and confusion-matrix labels later in this notebook — indicate that
# label 1 = deceptive/fake and label 0 = genuine. The previous display
# mapping ({0: 'Deceptive', 1: 'Genuine'}) was inverted; verify against the
# dataset's documentation.
custom_labels = dataset.label
# Plain string: the original used an f-string with no placeholders
print('Ratio of genuine and deceptive news:')
custom_labels.value_counts(normalize=True).rename({0: 'Genuine', 1: 'Deceptive'})
Ratio of genuine and deceptive news:
label Genuine 0.514404 Deceptive 0.485596 Name: proportion, dtype: float64
# Remove the redundant 'Unnamed: 0' row-index column carried over from the CSV
dataset = dataset.drop(columns='Unnamed: 0')
# Confirm the column is gone
dataset.head()
| title | text | label | |
|---|---|---|---|
| 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | No comment is expected from Barack Obama Membe... | 1 |
| 1 | NaN | Did they post their votes for Hillary already? | 1 |
| 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | Now, most of the demonstrators gathered last ... | 1 |
| 3 | Bobby Jindal, raised Hindu, uses story of Chri... | A dozen politically active pastors came here f... | 0 |
| 4 | SATAN 2: Russia unvelis an image of its terrif... | The RS-28 Sarmat missile, dubbed Satan 2, will... | 1 |
# Tally missing entries per column (isna is the modern alias of isnull)
missing_values_count = dataset.isna().sum()
print(missing_values_count)
# Horizontal bar chart of missing-value counts per column
missing_values_count.plot(kind="barh", color='red')
plt.show()
title 558 text 39 label 0 dtype: int64
# Fill missing text/title with sentinel strings. Column-level
# `fillna(..., inplace=True)` operates on a selection of the frame and is
# deprecated (FutureWarning under pandas >= 2.1; breaks under
# copy-on-write), so assign the result back instead.
dataset['text'] = dataset['text'].fillna('no_text')
dataset['title'] = dataset['title'].fillna('no_title')
# Combine 'title' and 'text' columns into a new column 'merged'
dataset['merged'] = dataset['title'] + ' ' + dataset['text']
# Length of each merged article excluding spaces and the sentinel markers.
# NOTE: despite the name, this counts characters, not words.
dataset["words_length"] = dataset["merged"].apply(
    lambda w: len(w) - w.count(" ") - w.count("no_text") - w.count("no_title"))
# Display the first few rows of the modified dataset
dataset.head()
| title | text | label | merged | words_length | |
|---|---|---|---|---|---|
| 0 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | No comment is expected from Barack Obama Membe... | 1 | LAW ENFORCEMENT ON HIGH ALERT Following Threat... | 4222 |
| 1 | no_title | Did they post their votes for Hillary already? | 1 | no_title Did they post their votes for Hillary... | 46 |
| 2 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | Now, most of the demonstrators gathered last ... | 1 | UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO... | 299 |
| 3 | Bobby Jindal, raised Hindu, uses story of Chri... | A dozen politically active pastors came here f... | 0 | Bobby Jindal, raised Hindu, uses story of Chri... | 6811 |
| 4 | SATAN 2: Russia unvelis an image of its terrif... | The RS-28 Sarmat missile, dubbed Satan 2, will... | 1 | SATAN 2: Russia unvelis an image of its terrif... | 1668 |
# Shared bin edges so the two class distributions are directly comparable
bin_edges = np.linspace(0, 200, 40)
# Overlayed article-length histograms per class (label 1 plotted as fake,
# label 0 as real, matching the rest of the notebook)
plt.hist(dataset.loc[dataset["label"] == 1, "words_length"], bin_edges, alpha=0.5, label="Fake News", color="red")
plt.hist(dataset.loc[dataset["label"] == 0, "words_length"], bin_edges, alpha=0.5, label="Real News", color="green")
# Identify which color is which class
plt.legend(loc="upper left")
plt.show()
# One long space-separated string of every headline
all_titles_concatenated = ' '.join(dataset['title'])
# Word cloud of the 20 most frequent title words on a black background
wordcloud = WordCloud(
    width=800,
    height=400,
    max_words=20,
    background_color='black',
).generate(all_titles_concatenated)
# Render the cloud with matplotlib, hiding the axes
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation='nearest')
plt.axis("off")
plt.show()
# Splitting the data into independent features and the target variable
features = dataset['merged']
labels = dataset['label']
# Collect article bodies per class using boolean masks (the original did a
# per-index iloc lookup, which is both slower and harder to read).
# NOTE(review): per the label semantics used throughout this notebook
# (label 1 = fake, label 0 = real — see the histogram labels and sample
# rows), fake_texts must come from label 1 and real_texts from label 0;
# the original selection was inverted. Verify against the dataset docs.
fake_texts = ' '.join(dataset.loc[labels == 1, 'text'])
real_texts = ' '.join(dataset.loc[labels == 0, 'text'])
# Word clouds of the 20 most frequent words in each class
fake_wordcloud = WordCloud(background_color='black', max_words=20, width=800, height=400).generate(fake_texts)
real_wordcloud = WordCloud(background_color='black', max_words=20, width=800, height=400).generate(real_texts)
# Side-by-side display: fake on the left, real on the right
plt.figure(figsize=(20, 10))
plt.subplot(1, 2, 1)
plt.imshow(fake_wordcloud, interpolation='nearest')
plt.axis("off")
plt.title("Word Cloud for Fake News Articles")
plt.subplot(1, 2, 2)
plt.imshow(real_wordcloud, interpolation='nearest')
plt.axis("off")
plt.title("Word Cloud for Real News Articles")
plt.show()
# TF-IDF features over the merged title+text, capped at 1000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = tfidf_vectorizer.fit_transform(features)
# Hold out 30% of the data for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, labels, test_size=0.3, random_state=10)
# Fit the Multinomial Naive Bayes baseline (fit returns the estimator)
nb_classifier_model = MultinomialNB().fit(X_train, y_train)
# Predictions on the held-out set
y_pred = nb_classifier_model.predict(X_test)
# Overall accuracy
accuracy_nb = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_nb:.2f}")
# Per-class precision / recall / F1
print("Classification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.84
Classification Report:
precision recall f1-score support
0 0.85 0.81 0.83 10437
1 0.83 0.86 0.85 11204
accuracy 0.84 21641
macro avg 0.84 0.84 0.84 21641
weighted avg 0.84 0.84 0.84 21641
# Confusion matrix for the Naive Bayes predictions; sklearn orders the
# axes by sorted label value, i.e. [0, 1]
cm = confusion_matrix(y_test, y_pred)
# NOTE(review): tick order assumes label 0 = real, 1 = fake — verify
tick_names = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Reds',
    fmt='g',
    xticklabels=tick_names,
    yticklabels=tick_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Multinomial Naive Bayes classifier model')
plt.show()
So, here's what I found when checking out how our Multinomial Naive Bayes classifier did. First off, we nailed an 84% accuracy, showing that the model pretty much got the labels right for most of the dataset – not too shabby! Digging into the details, the F1-score for spotting Fake news hit 85%, signaling a nice balance between precision and recall in catching those fake stories. Interestingly, it seems like the model has a bit of a knack for tagging Fake news ('1') more accurately than Real news ('0'). This shows up in the higher recall and F1-score for Fake news, suggesting our model is pretty good at sniffing out misinformation.
# Logistic Regression on the TF-IDF features. max_iter raised from 100
# (the solver default) to 1000: lbfgs frequently hits the 100-iteration
# cap without converging on high-dimensional sparse TF-IDF data and emits
# a ConvergenceWarning; a higher cap only allows full convergence.
logreg_classifier_model = LogisticRegression(max_iter=1000)
# Train the classifier
logreg_classifier_model.fit(X_train, y_train)
# Predict on the test set
y_pred = logreg_classifier_model.predict(X_test)
# Calculate accuracy
accuracy_lr = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_lr:.2f}")
# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.94
Classification Report:
precision recall f1-score support
0 0.94 0.93 0.94 10437
1 0.94 0.95 0.94 11204
accuracy 0.94 21641
macro avg 0.94 0.94 0.94 21641
weighted avg 0.94 0.94 0.94 21641
# Confusion matrix for the Logistic Regression predictions
cm = confusion_matrix(y_test, y_pred)
# Axis order follows sorted labels [0, 1]; assumes 0 = real, 1 = fake
class_names = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Reds',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Logistic Regression')
plt.show()
So, after giving Logistic Regression a spin for sorting out fake news, I was blown away by the results. The accuracy skyrocketed from 84% to a whopping 94%, showing some serious improvement. Dive a bit deeper, and you'll find the F1-scores for both labels comfortably sitting at 94%, which is pretty sweet – a perfect balance between nabbing fake and real news. The real star of the show, though, is the Logistic Regression model. It's like the superhero of news classification, outshining the Multinomial Naive Bayes classifier. With higher accuracy, precision, and recall for both fake and real news, it's clear that this model means business and is super effective in the classification game. Hands down, the champ of the classification world!
# Linear-kernel SVM trained on the same TF-IDF features
svm_classifier_model = SVC(kernel='linear').fit(X_train, y_train)
# Predictions on the held-out 30%
y_pred = svm_classifier_model.predict(X_test)
# Overall accuracy
accuracy_svm = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_svm:.2f}")
# Per-class precision / recall / F1
print("Classification Report for SVM:")
print(classification_report(y_test, y_pred))
Accuracy: 0.94
Classification Report for SVM:
precision recall f1-score support
0 0.95 0.93 0.94 10437
1 0.94 0.96 0.95 11204
accuracy 0.94 21641
macro avg 0.95 0.94 0.94 21641
weighted avg 0.94 0.94 0.94 21641
# Confusion matrix for the SVM predictions (axis order = sorted labels [0, 1])
cm = confusion_matrix(y_test, y_pred)
axis_labels = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Reds',
    fmt='g',
    xticklabels=axis_labels,
    yticklabels=axis_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Support vector machine ')
plt.show()
So, diving into the nitty-gritty, the SVM model pulled off a solid 94% accuracy, just like its buddy Logistic Regression. Looking at the results, the recall for fake news hits a cool 96%, suggesting this model is a pro at spotting fake news within the actual bunch of fake news articles. The SVM model seems to be playing in the same league as Logistic Regression, both rocking a 94% accuracy. They're like the dynamic duo of distinguishing between fake and real news. Talk about a tag team in action.
# Candidate forest sizes to search over
param_grid = {'n_estimators': [5, 10, 20, 50, 100]}
# Base Random Forest with a fixed seed for reproducibility
rf_classifier = RandomForestClassifier(random_state=10)
# 5-fold cross-validated search over n_estimators, scored on accuracy
grid_search = GridSearchCV(
    rf_classifier,
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)
# Best refit estimator and the winning hyper-parameters
best_estimator = grid_search.best_estimator_
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
Best Parameters: {'n_estimators': 100}
# Evaluate the tuned Random Forest. GridSearchCV (refit=True, the default)
# has already retrained a forest with the best n_estimators and the same
# random_state on the full training set, so reuse that estimator instead
# of fitting an identical model from scratch.
rf_classifier_best = grid_search.best_estimator_
# Predict on the test set
y_pred = rf_classifier_best.predict(X_test)
# Calculate accuracy
accuracy_rf = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy_rf:.2f}")
# Display classification report for the random forest
print("Classification Report - Random Forest Algorithm:")
print(classification_report(y_test, y_pred))
Accuracy: 0.95
Classification Report - Random Forest Algorithm:
precision recall f1-score support
0 0.97 0.93 0.95 10437
1 0.94 0.97 0.95 11204
accuracy 0.95 21641
macro avg 0.95 0.95 0.95 21641
weighted avg 0.95 0.95 0.95 21641
# Confusion matrix for the tuned Random Forest predictions
cm = confusion_matrix(y_test, y_pred)
# Axis order follows sorted labels [0, 1]; assumes 0 = real, 1 = fake
names = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Reds',
            xticklabels=names, yticklabels=names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Random Forest Algorithm')
plt.show()
Alright, diving into the details, I decided to give the Random Forest model a power-up using grid search. Turns out, the sweet spot for "n_estimators" is 100, and bingo – we hit a stellar 95% accuracy, leaving Logistic Regression and SVM in the dust. But here's the cool part – the recall value for Fake news soared to 97%, making this model a real pro at catching fake news within the mix of actual fake news articles. It's rocking a higher recall than both Logistic Regression and SVM. When I stack it up against Logistic Regression, SVM, and even Naive Bayes, the Random Forest model steals the spotlight. With the highest accuracy and a standout performance in pinpointing fake news articles, it stands out as the strongest classifier evaluated so far.
# Raw text and labels for the neural-network pipeline
X = dataset['merged']
y = dataset['label']
# Same 70/30 split and seed as the classical models
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
# Vocabulary capped at the 500 most frequent words; the rest map to <OOV>
tokenizer = Tokenizer(num_words=500, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Integer-encode, then pad/truncate every article to exactly 50 tokens
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=50, padding="post", truncating="post")
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=50, padding="post", truncating="post")
# Invert the vocabulary into (rank, word) pairs for inspection
reverse_idx = [(rank, word) for word, rank in word_index.items()]
reverse_idx[:20]
[(1, '<OOV>'), (2, 'the'), (3, 'to'), (4, 'of'), (5, 'and'), (6, 'a'), (7, 'in'), (8, 'that'), (9, 'is'), (10, 'for'), (11, 'on'), (12, 'it'), (13, 'he'), (14, 'with'), (15, 's'), (16, 'was'), (17, 'as'), (18, 'said'), (19, 'by'), (20, 'trump')]
# Baseline neural classifier: embedding -> average pooling -> small MLP
model = tf.keras.Sequential()
# 500-word vocabulary embedded into 16 dimensions, 50-token inputs
model.add(tf.keras.layers.Embedding(500, 16, input_length=50))
# Collapse the sequence dimension by averaging the embeddings
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(6, activation='relu'))
# Sigmoid output for the binary fake/real decision
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Binary cross-entropy with the adam optimizer
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print model summary
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 50, 16) 8000
global_average_pooling1d ( (None, 16) 0
GlobalAveragePooling1D)
dense (Dense) (None, 6) 102
dense_1 (Dense) (None, 1) 7
=================================================================
Total params: 8109 (31.68 KB)
Trainable params: 8109 (31.68 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
# Train for 30 epochs, tracking validation metrics each epoch for plotting
history = model.fit(
    train_padded,
    y_train,
    validation_data=(test_padded, y_test),
    epochs=30,
)
Epoch 1/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.3288 - accuracy: 0.8789 - val_loss: 0.2036 - val_accuracy: 0.9186 Epoch 2/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1904 - accuracy: 0.9238 - val_loss: 0.1870 - val_accuracy: 0.9247 Epoch 3/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1798 - accuracy: 0.9286 - val_loss: 0.1814 - val_accuracy: 0.9261 Epoch 4/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1767 - accuracy: 0.9299 - val_loss: 0.1787 - val_accuracy: 0.9267 Epoch 5/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1754 - accuracy: 0.9306 - val_loss: 0.1787 - val_accuracy: 0.9269 Epoch 6/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1747 - accuracy: 0.9301 - val_loss: 0.1837 - val_accuracy: 0.9250 Epoch 7/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1744 - accuracy: 0.9306 - val_loss: 0.1775 - val_accuracy: 0.9268 Epoch 8/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1743 - accuracy: 0.9297 - val_loss: 0.1776 - val_accuracy: 0.9269 Epoch 9/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1741 - accuracy: 0.9300 - val_loss: 0.1772 - val_accuracy: 0.9270 Epoch 10/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1737 - accuracy: 0.9307 - val_loss: 0.1780 - val_accuracy: 0.9265 Epoch 11/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1739 - accuracy: 0.9310 - val_loss: 0.1771 - val_accuracy: 0.9271 Epoch 12/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1735 - accuracy: 0.9308 - val_loss: 0.1777 - val_accuracy: 0.9268 Epoch 13/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1732 - accuracy: 0.9309 - val_loss: 0.1794 - val_accuracy: 0.9262 Epoch 14/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1732 - accuracy: 0.9302 - val_loss: 0.1781 - 
val_accuracy: 0.9260 Epoch 15/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1730 - accuracy: 0.9310 - val_loss: 0.1775 - val_accuracy: 0.9268 Epoch 16/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1727 - accuracy: 0.9313 - val_loss: 0.1785 - val_accuracy: 0.9263 Epoch 17/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1730 - accuracy: 0.9306 - val_loss: 0.1777 - val_accuracy: 0.9273 Epoch 18/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1724 - accuracy: 0.9312 - val_loss: 0.1774 - val_accuracy: 0.9281 Epoch 19/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1725 - accuracy: 0.9305 - val_loss: 0.1762 - val_accuracy: 0.9269 Epoch 20/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1722 - accuracy: 0.9313 - val_loss: 0.1767 - val_accuracy: 0.9260 Epoch 21/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1722 - accuracy: 0.9309 - val_loss: 0.1767 - val_accuracy: 0.9269 Epoch 22/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1717 - accuracy: 0.9313 - val_loss: 0.1761 - val_accuracy: 0.9272 Epoch 23/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1716 - accuracy: 0.9312 - val_loss: 0.1769 - val_accuracy: 0.9271 Epoch 24/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1715 - accuracy: 0.9314 - val_loss: 0.1760 - val_accuracy: 0.9269 Epoch 25/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1712 - accuracy: 0.9316 - val_loss: 0.1760 - val_accuracy: 0.9273 Epoch 26/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1709 - accuracy: 0.9323 - val_loss: 0.1769 - val_accuracy: 0.9269 Epoch 27/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1705 - accuracy: 0.9316 - val_loss: 0.1781 - val_accuracy: 0.9277 Epoch 28/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1703 - 
accuracy: 0.9316 - val_loss: 0.1751 - val_accuracy: 0.9266 Epoch 29/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1700 - accuracy: 0.9316 - val_loss: 0.1747 - val_accuracy: 0.9275 Epoch 30/30 1578/1578 [==============================] - 2s 1ms/step - loss: 0.1695 - accuracy: 0.9321 - val_loss: 0.1743 - val_accuracy: 0.9287
# Collect per-epoch train/validation accuracy into a DataFrame
accuracy_df = pd.DataFrame({
    'accuracy': history.history["accuracy"],
    'val_accuracy': history.history["val_accuracy"],
})
# Line plot: training vs validation accuracy per epoch
fig = ptx.line(
    accuracy_df,
    x=accuracy_df.index,
    y=['accuracy', 'val_accuracy'],
    title='Train Accuracy Vs Validation Accuracy',
    color_discrete_sequence=["green", "orange"],
)
fig.update_layout(xaxis_title="Epochs", yaxis_title="Accuracy")
fig.show()
# Same comparison for the loss curves
loss_df = pd.DataFrame({
    'loss': history.history["loss"],
    'val_loss': history.history["val_loss"],
})
fig = ptx.line(
    loss_df,
    x=loss_df.index,
    y=['loss', 'val_loss'],
    title='Train Loss Vs Validation Loss',
    color_discrete_sequence=["green", "orange"],
)
fig.update_layout(xaxis_title="Epochs", yaxis_title="Loss")
fig.show()
# Predicted sigmoid probabilities for the padded test sequences,
# shape (n_samples, 1) from the final Dense(1) layer
preds = model.predict(test_padded)
677/677 [==============================] - 1s 686us/step
# round_result converts a sigmoid probability into a hard class label.
def round_result(num):
    """Return 1 if num is strictly greater than 0.5, else 0.

    Matches the original strict comparison: a probability of exactly
    0.5 maps to class 0.
    """
    return int(num > 0.5)
# Map each predicted probability to a hard 0/1 label
predicted_result = [round_result(p) for p in preds.flatten()]
# Overall accuracy of the plain embedding network
accuracy_nn = accuracy_score(y_test, predicted_result)
print(f"Accuracy: {accuracy_nn:.2f}")
# Per-class precision / recall / F1
print("Neural network performance:")
print(classification_report(y_test, predicted_result))
Accuracy: 0.93
Neural network performance:
precision recall f1-score support
0 0.94 0.91 0.92 10437
1 0.92 0.95 0.93 11204
accuracy 0.93 21641
macro avg 0.93 0.93 0.93 21641
weighted avg 0.93 0.93 0.93 21641
# Confusion matrix for the baseline neural network's predictions
cm = confusion_matrix(y_test, predicted_result)
# Axis order follows sorted labels [0, 1]; assumes 0 = real, 1 = fake
tick_names = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='g', cmap='Reds',
            xticklabels=tick_names, yticklabels=tick_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - Neural network')
plt.show()
Looking at the neural network, it did pretty well with a 93% accuracy, but it couldn't quite beat the Random Forest. What stands out is that it's good at catching fake news – the recall value for fake news is 95%, meaning it's solid at spotting fake news within the actual batch of fake news articles. Now, here's the thing – the neural network has potential, but we can make it even better. By adjusting some settings, trying different approaches, or maybe changing how it understands words, we might boost its performance. As part of our exploration, we also gave a shot to a Long Short-Term Memory (LSTM) neural network to better understand the sequence of information in the data. It's all about tweaking things and trying out different ideas to see how we can make it even sharper!
# Same architecture as the baseline network, with the average-pooling
# layer replaced by a 64-unit LSTM to capture word order
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.Embedding(500, 16, input_length=50))
model_lstm.add(tf.keras.layers.LSTM(64))
model_lstm.add(tf.keras.layers.Dense(6, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(1, activation='sigmoid'))
# Binary cross-entropy + adam, as in the baseline network
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print model summary
model_lstm.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 50, 16) 8000
lstm (LSTM) (None, 64) 20736
dense_2 (Dense) (None, 6) 390
dense_3 (Dense) (None, 1) 7
=================================================================
Total params: 29133 (113.80 KB)
Trainable params: 29133 (113.80 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
# Train the LSTM variant for 30 epochs, recording validation metrics
# each epoch so the curves can be plotted below
history_lstm = model_lstm.fit(
    train_padded,
    y_train,
    validation_data=(test_padded, y_test),
    epochs=30,
)
Epoch 1/30 1578/1578 [==============================] - 20s 12ms/step - loss: 0.2454 - accuracy: 0.8964 - val_loss: 0.1750 - val_accuracy: 0.9276 Epoch 2/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1752 - accuracy: 0.9294 - val_loss: 0.1758 - val_accuracy: 0.9305 Epoch 3/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1636 - accuracy: 0.9348 - val_loss: 0.1598 - val_accuracy: 0.9343 Epoch 4/30 1578/1578 [==============================] - 20s 13ms/step - loss: 0.1535 - accuracy: 0.9379 - val_loss: 0.1556 - val_accuracy: 0.9339 Epoch 5/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1467 - accuracy: 0.9401 - val_loss: 0.1580 - val_accuracy: 0.9326 Epoch 6/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1398 - accuracy: 0.9432 - val_loss: 0.1526 - val_accuracy: 0.9372 Epoch 7/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1340 - accuracy: 0.9450 - val_loss: 0.1538 - val_accuracy: 0.9364 Epoch 8/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1284 - accuracy: 0.9478 - val_loss: 0.1519 - val_accuracy: 0.9384 Epoch 9/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.1243 - accuracy: 0.9509 - val_loss: 0.1496 - val_accuracy: 0.9373 Epoch 10/30 1578/1578 [==============================] - 21s 13ms/step - loss: 0.1191 - accuracy: 0.9523 - val_loss: 0.1480 - val_accuracy: 0.9392 Epoch 11/30 1578/1578 [==============================] - 20s 13ms/step - loss: 0.1133 - accuracy: 0.9553 - val_loss: 0.1512 - val_accuracy: 0.9358 Epoch 12/30 1578/1578 [==============================] - 20s 13ms/step - loss: 0.1099 - accuracy: 0.9571 - val_loss: 0.1500 - val_accuracy: 0.9375 Epoch 13/30 1578/1578 [==============================] - 20s 12ms/step - loss: 0.1054 - accuracy: 0.9592 - val_loss: 0.1596 - val_accuracy: 0.9382 Epoch 14/30 1578/1578 [==============================] - 20s 13ms/step - loss: 0.1008 - accuracy: 
0.9607 - val_loss: 0.1529 - val_accuracy: 0.9350 Epoch 15/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0974 - accuracy: 0.9627 - val_loss: 0.1581 - val_accuracy: 0.9368 Epoch 16/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0937 - accuracy: 0.9641 - val_loss: 0.1599 - val_accuracy: 0.9369 Epoch 17/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0888 - accuracy: 0.9661 - val_loss: 0.1663 - val_accuracy: 0.9342 Epoch 18/30 1578/1578 [==============================] - 20s 13ms/step - loss: 0.0864 - accuracy: 0.9675 - val_loss: 0.1692 - val_accuracy: 0.9360 Epoch 19/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0822 - accuracy: 0.9693 - val_loss: 0.1836 - val_accuracy: 0.9315 Epoch 20/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0780 - accuracy: 0.9714 - val_loss: 0.1737 - val_accuracy: 0.9394 Epoch 21/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0740 - accuracy: 0.9731 - val_loss: 0.1873 - val_accuracy: 0.9392 Epoch 22/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0691 - accuracy: 0.9754 - val_loss: 0.1895 - val_accuracy: 0.9346 Epoch 23/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0680 - accuracy: 0.9756 - val_loss: 0.1854 - val_accuracy: 0.9373 Epoch 24/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0660 - accuracy: 0.9766 - val_loss: 0.1923 - val_accuracy: 0.9393 Epoch 25/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0588 - accuracy: 0.9795 - val_loss: 0.1943 - val_accuracy: 0.9357 Epoch 26/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0581 - accuracy: 0.9797 - val_loss: 0.2101 - val_accuracy: 0.9355 Epoch 27/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0541 - accuracy: 0.9818 - val_loss: 0.2185 - val_accuracy: 0.9340 Epoch 28/30 1578/1578 
[==============================] - 19s 12ms/step - loss: 0.0498 - accuracy: 0.9834 - val_loss: 0.2362 - val_accuracy: 0.9344 Epoch 29/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0499 - accuracy: 0.9835 - val_loss: 0.2260 - val_accuracy: 0.9375 Epoch 30/30 1578/1578 [==============================] - 19s 12ms/step - loss: 0.0476 - accuracy: 0.9840 - val_loss: 0.2174 - val_accuracy: 0.9372
# Per-epoch train/validation accuracy of the LSTM model
accuracy_df_lstm = pd.DataFrame({
    'accuracy': history_lstm.history["accuracy"],
    'val_accuracy': history_lstm.history["val_accuracy"],
})
fig_lstm = ptx.line(
    accuracy_df_lstm,
    x=accuracy_df_lstm.index,
    y=['accuracy', 'val_accuracy'],
    title='Train Accuracy Vs Validation Accuracy',
    color_discrete_sequence=["green", "orange"],
)
fig_lstm.update_layout(xaxis_title="Epochs", yaxis_title="Accuracy")
fig_lstm.show()
# Per-epoch train/validation loss of the LSTM model
loss_df_lstm = pd.DataFrame({
    'loss': history_lstm.history["loss"],
    'val_loss': history_lstm.history["val_loss"],
})
fig_lstm = ptx.line(
    loss_df_lstm,
    x=loss_df_lstm.index,
    y=['loss', 'val_loss'],
    title='Train Loss Vs Validation Loss',
    color_discrete_sequence=["green", "orange"],
)
fig_lstm.update_layout(xaxis_title="Epochs", yaxis_title="Loss")
fig_lstm.show()
# Threshold the LSTM's sigmoid outputs into hard 0/1 class labels
predicted_result_lstm = [round_result(p) for p in model_lstm.predict(test_padded).flatten()]
677/677 [==============================] - 3s 4ms/step
# Overall accuracy of the LSTM network on the test set
accuracy_lstm = accuracy_score(y_test, predicted_result_lstm)
print(f"Accuracy: {accuracy_lstm:.2f}")
# Per-class precision / recall / F1
print("LSTM Neural network performance:")
print(classification_report(y_test, predicted_result_lstm))
Accuracy: 0.94
LSTM Neural network performance:
precision recall f1-score support
0 0.95 0.92 0.93 10437
1 0.93 0.95 0.94 11204
accuracy 0.94 21641
macro avg 0.94 0.94 0.94 21641
weighted avg 0.94 0.94 0.94 21641
# Confusion matrix for the LSTM network's predictions
cm = confusion_matrix(y_test, predicted_result_lstm)
# Axis order follows sorted labels [0, 1]; assumes 0 = real, 1 = fake
class_names = ['Real', 'Fake']
plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    cmap='Reds',
    fmt='g',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix - LSTM Neural network')
plt.show()
Checking out the impact of adding the LSTM layer to our neural network for fake news classification, things are looking up. The accuracy got a bump to 94%, which is a noticeable improvement. Digging into the details, the precision for fake news sits at 93%, showing the model's knack for accurately predicting fake news. However, it's worth noting that this precision is a bit lower compared to its precision for real news. Now, comparing this to the neural network with just the "Embedding" layer, the LSTM-based neural network steps up its game, particularly in improving the recall for real news. It's a positive move, showcasing the strengths of the LSTM layer in enhancing certain aspects of the model's performance.
# Side-by-side accuracy comparison of every model trained above
print(f"Accuracy - Naive Bayes : {accuracy_nb:.2f}")
print(f"Accuracy - Logistic regression: {accuracy_lr:.2f}")
print(f"Accuracy - Support Vector Machine: {accuracy_svm:.2f}")
print(f"Accuracy - Random Forest Algorithm: {accuracy_rf:.2f}")
print(f"Accuracy - Neural network : {accuracy_nn:.2f}")
# Typo fix in the label: "LSMT" -> "LSTM"
print(f"Accuracy - LSTM Neural network: {accuracy_lstm:.2f}")
Accuracy - Naive Bayes : 0.84 Accuracy - Logistic regression: 0.94 Accuracy - Support Vector Machine: 0.94 Accuracy - Random Forest Algorithm: 0.95 Accuracy - Neural network : 0.93 Accuracy - LSMT Neural network: 0.94
In our project on fake news classification, we employed exploratory data analysis, effective data preprocessing, and various modeling techniques to categorize fake news. Utilizing a diverse set of machine learning algorithms, ranging from traditional methods like Naive Bayes, Logistic Regression, SVM, and Random Forest to more complex neural network architectures, including an LSTM-based network, we aimed to identify the most effective approach. Among the traditional machine learning models, the Random Forest Classifier emerged as the top performer, achieving an impressive accuracy of 95%. Nevertheless, both the LSTM neural network and SVM displayed competitive performance, each boasting an accuracy of 94%. These results underscore the effectiveness of different modeling techniques in discerning between real and fake news articles, with both neural networks and ensemble methods showcasing promising results. In summary, our project provides a comprehensive exploration of various approaches for fake news classification, shedding light on the strengths and performances of different models. The findings highlight the potential of employing diverse strategies to effectively distinguish between genuine and deceptive news articles.